#
# Print numbers in fixed notation: a large `scipen` penalty makes R prefer
# fixed over scientific notation when formatting numeric output.
options(scipen = 999)
library(stringr)
library(corrplot)
## corrplot 0.84 loaded
library(shiny)
library(lme4)
## Loading required package: Matrix
library(lmerTest)
##
## Attaching package: 'lmerTest'
## The following object is masked from 'package:lme4':
##
## lmer
## The following object is masked from 'package:stats':
##
## step
# Load the county-level covariate tables.
# NOTE(review): these .rda files presumably define `county_factors` and
# `county_500CitiesData`, which are merged with the COVID statistics later in
# the script -- confirm.
load("Data/county_factors.rda")
load("Data/county_500CitiesData.rda")
data.path <- "Data/COVID-19/csse_covid_19_data/csse_covid_19_time_series/"
# Read in the data
US.deaths <- read.csv(
  paste0(data.path, "time_series_covid19_deaths_US.csv"),
  header = TRUE, stringsAsFactors = FALSE)
US.cases <- read.csv(
  paste0(data.path, "time_series_covid19_confirmed_US.csv"),
  header = TRUE, stringsAsFactors = FALSE)
# Read in the header separately, as plain text. With header = TRUE, read.csv()
# rewrites the date column names into syntactic R names, so the raw first line
# is kept to rebuild usable column names afterwards. Only that one line is
# needed, hence nrows = 1 instead of parsing the whole file a second time;
# colClasses = "character" keeps every header cell as text.
US.cases.head <- read.csv(
  paste0(data.path, "time_series_covid19_confirmed_US.csv"),
  header = FALSE, stringsAsFactors = FALSE,
  colClasses = "character", nrows = 1)
US.deaths.head <- read.csv(
  paste0(data.path, "time_series_covid19_deaths_US.csv"),
  header = FALSE, stringsAsFactors = FALSE,
  colClasses = "character", nrows = 1)
# Turn slash-separated dates from the CSV header (e.g. "1/22/20") into
# underscore-separated, zero-padded strings (e.g. "01_22_20") that are easier
# to use as column names.
#
# dates: the date strings to convert (a character vector or a one-row
#        data frame of header cells).
# Returns a character vector with one reformatted date per input element.
proper_date <- function(dates){
  # One character vector of date components per input element.
  pieces <- sapply(dates, strsplit, split = "/")
  rebuilt <- lapply(pieces, function(p){
    # Left-pad every component to two characters, then glue with "_".
    paste(str_pad(p, width = 2, side = "left", pad = "0"), collapse = "_")
  })
  unlist(rebuilt)
}
# Rebuild the column names from the raw header rows. The confirmed-cases table
# has 11 leading non-date columns; the deaths table has 12 (it additionally
# carries Population, which is copied onto the cases table below).
dates.cases <- proper_date(US.cases.head[-(1:11)])
dates.deaths <- proper_date(US.deaths.head[-(1:12)])
names(US.cases) <- c(US.cases.head[1, 1:11], dates.cases)
names(US.deaths) <- c(US.deaths.head[1, 1:12], dates.deaths)
# Both tables are combined row-by-row from here on, so their UIDs must line up.
if(any(US.cases$UID != US.deaths$UID, na.rm = TRUE)){
  warning("COVID data rows do not match!")
}
# Append Population to the cases table, then move it from the last position to
# column 12 so both tables share the same leading layout.
US.cases$Population <- US.deaths$Population
n.col <- ncol(US.cases)
US.cases <- US.cases[, c(1:11, n.col, 12:(n.col - 1))]
data.path <- "Data/COVID-19/csse_covid_19_data/csse_covid_19_daily_reports_us/"
daily_filenames <- list.files(data.path)
daily_filenames <- daily_filenames[daily_filenames != "README.md"]
# list.files() sorts alphabetically. The first 10 characters of each file name
# are the report date (NOTE(review): assumed to be MM-DD-YYYY, as used by the
# JHU CSSE repository -- confirm), so alphabetical order is only chronological
# within a single year. Order by the parsed date instead; names that do not
# parse sort last and ties fall back to the file name, so the original order is
# kept if the format assumption does not hold.
report.dates <- as.Date(substring(daily_filenames, 1, 10), format = "%m-%d-%Y")
daily_filenames <- daily_filenames[order(report.dates, daily_filenames)]
# The most recent report is the last one in chronological order.
todays_report_filename <- daily_filenames[length(daily_filenames)]
US.todaysReport <- read.csv(
  paste0(data.path, todays_report_filename),
  header = TRUE, stringsAsFactors = FALSE)
all.states <- c('Alabama', 'Alaska', 'American Samoa', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 'Diamond Princess', 'District of Columbia', 'Florida', 'Georgia', 'Grand Princess', 'Guam', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Northern Mariana Islands', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Puerto Rico', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virgin Islands', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming')
all.states.df <- data.frame(Province_State = all.states)
all.stats <- c("Confirmed", "Deaths", "Recovered", "Active", "Incident_Rate", "People_Tested", "People_Hospitalized", "Mortality_Rate", "Testing_Rate", "Hospitalization_Rate")
# Read every daily report and keep only the rows whose Province_State appears
# in `all.states`. The result is a list with one data frame per report, in the
# same order as `daily_filenames` and named by the 10-character date prefix.
compiled.stats <- vector("list", length(daily_filenames))
for(i in seq_along(daily_filenames)){
  day <- substring(daily_filenames[i], 1, 10)
  data <- read.csv(
    paste0(data.path, daily_filenames[i]),
    header = TRUE, stringsAsFactors = FALSE)
  compiled.stats[[i]] <- merge(all.states.df, data, all.y = FALSE)
  names(compiled.stats)[i] <- day
}
# Bar chart of one statistic for one state across all compiled daily reports.
#
# state: value to match against the Province_State column.
# stat:  name of the column to plot (e.g. one of `all.stats`).
# Reads the globals `compiled.stats` and `daily_filenames`; bars are labelled
# with the report file names. Returns whatever barplot() returns.
plot.dailyStat <- function(state, stat){
  day.index <- 1:length(daily_filenames)
  values <- sapply(day.index, function(k){
    report <- compiled.stats[[k]]
    report[report$Province_State == state, stat]
  })
  names(values) <- daily_filenames
  barplot(values, main = paste0(state, " ", stat), las = 2, cex.axis = 1, cex.names = 0.5)
}
# Bar chart of the report-to-report change in one statistic for one state.
#
# state: value to match against the Province_State column.
# stat:  name of the (numeric) column whose change is plotted.
# Reads the globals `compiled.stats` and `daily_filenames`. Each bar is the
# value in one report minus the value in the previous report and is labelled
# with the later report's file name. Stops if fewer than two reports exist.
# Returns whatever barplot() returns.
plot.dailyStatRise <- function(state, stat){
  data <- sapply(seq_along(daily_filenames), function(x){
    compiled.stats[[x]][compiled.stats[[x]]$Province_State == state, stat]
  })
  names(data) <- daily_filenames
  if(length(data) < 2){
    stop("at least two daily reports are needed to compute a rise", call. = FALSE)
  }
  # diff() gives data[i] - data[i - 1] for i = 2..n. It is kept as a one-row
  # matrix with the later file name as column name, which is the shape the
  # plot has always been drawn from.
  rise.stat <- matrix(diff(data), nrow = 1, dimnames = list(NULL, names(data)[-1]))
  barplot(rise.stat, main = paste0(state, " rise in ",stat), las = 2, cex.axis = 1, cex.names = 0.5)
}
# Horizontal bar chart of Testing_Rate per state, taken from the compiled
# report at the last position of `daily_filenames`.
latest.report <- compiled.stats[[length(daily_filenames)]]
testing.data.state <- latest.report[, c("Province_State", "Testing_Rate")]
testing.data.state <- testing.data.state[order(testing.data.state$Testing_Rate), ]
avg.test.rate <- mean(testing.data.state$Testing_Rate, na.rm = TRUE)
# Colour coding: pink by default, grey below the average rate, and Oklahoma
# always highlighted in light blue.
col.state <- rep("pink", nrow(testing.data.state))
below.avg <- testing.data.state$Testing_Rate < avg.test.rate
col.state[below.avg] <- "grey"
col.state[testing.data.state$Province_State == "Oklahoma"] <- "lightblue"
# Widen the left margin so the state names fit.
par(mar = c(5, 6, 4, 2))
barplot(
  testing.data.state$Testing_Rate,
  names.arg = testing.data.state$Province_State,
  horiz = TRUE,
  main = "Testing Rate by State",
  las = 2, cex.axis = 1, cex.names = 0.5,
  col = col.state, border = FALSE,
  xlab = "Total number of people tested per 100,000 persons.")
# Mark the average rate with a labelled vertical line.
abline(v = avg.test.rate, col = "red")
text(
  x = avg.test.rate + 10, y = 1,
  labels = "Average Testing Rate",
  adj = c(0, 0.5), col = "red")
Province_State - The name of the State within the USA. Country_Region - The name of the Country (US). Last_Update - The most recent date the file was pushed. Lat - Latitude. Long_ - Longitude. Confirmed - Aggregated confirmed case count for the state. Deaths - Aggregated Death case count for the state. Recovered - Aggregated Recovered case count for the state. Active - Aggregated confirmed cases that have not been resolved (Active = Confirmed - Recovered - Deaths). FIPS - Federal Information Processing Standards code that uniquely identifies counties within the USA. Incident_Rate - confirmed cases per 100,000 persons. People_Tested - Total number of people who have been tested. People_Hospitalized - Total number of people hospitalized. Mortality_Rate - Number recorded deaths * 100/ Number confirmed cases. UID - Unique Identifier for each row entry. ISO3 - Officially assigned country code identifiers. Testing_Rate - Total number of people tested per 100,000 persons. Hospitalization_Rate - Total number of people hospitalized * 100/ Number of confirmed cases.
# Split each time-series table into two matrices keyed by UID (column 1):
#   *.info - columns 2..12 (the non-date columns after UID)
#   *.data - the remaining date columns (cumulative counts)
# In both cases column 1 is first used as row names and then dropped.

# Confirmed cases
US.cases.info <- as.matrix(US.cases[, 1:12])
rownames(US.cases.info) <- US.cases.info[, 1]
US.cases.info <- US.cases.info[, -1]
US.cases.data <- as.matrix(US.cases[, -(2:12)])
rownames(US.cases.data) <- US.cases.data[, 1]
US.cases.data <- US.cases.data[, -1]

# Deaths
US.deaths.info <- as.matrix(US.deaths[, 1:12])
rownames(US.deaths.info) <- US.deaths.info[, 1]
US.deaths.info <- US.deaths.info[, -1]
US.deaths.data <- as.matrix(US.deaths[, -(2:12)])
rownames(US.deaths.data) <- US.deaths.data[, 1]
US.deaths.data <- US.deaths.data[, -1]

# Dimensions reused below: number of date columns per table and number of rows.
ndays.cases <- ncol(US.cases.data)
ndays.deaths <- ncol(US.deaths.data)
nobs <- nrow(US.cases.data)
# Bar chart of the cumulative COVID-19 count for one state, starting at the
# first date with a positive total.
#
# state:    value to match against the Province_State column.
# stat:     "cases" (default) or "deaths".
# logScale: if TRUE (default) the y axis is logarithmic and the title says so.
# Reads the globals US.cases / US.cases.data or US.deaths / US.deaths.data.
# Stops with an error if no row matches `state` or no date has a positive
# total. Returns whatever barplot() returns.
state.curve <- function(state, stat = c("cases", "deaths"), logScale = TRUE){
  # Resolve the default (a length-2 vector) to a single valid choice.
  stat <- match.arg(stat)
  # drop = FALSE keeps a matrix even when only one row matches, so colSums()
  # also works for states that have a single row.
  if(stat == "cases"){
    data <- US.cases.data[which(US.cases$Province_State == state), , drop = FALSE]
  }else{
    data <- US.deaths.data[which(US.deaths$Province_State == state), , drop = FALSE]
  }
  if(nrow(data) == 0){
    stop("no rows found for state '", state, "'", call. = FALSE)
  }
  data.sum <- colSums(data)
  if(!any(data.sum > 0, na.rm = TRUE)){
    stop("no ", stat, " recorded for state '", state, "'", call. = FALSE)
  }
  day.first.case <- min(which(data.sum > 0))
  n.days <- length(data.sum)
  if(logScale == TRUE){
    barplot(data.sum[day.first.case:n.days],
            main = paste0("Total COVID-19 ", stat," by date in ", state, ", log scale"),
            log = "y", las = 2, cex.axis = 1, cex.names = 0.5)
  }else{
    barplot(data.sum[day.first.case:n.days],
            main = paste0("Total COVID-19 ", stat," by date in ", state),
            las = 2, cex.axis = 1, cex.names = 0.5)
  }
}
# Bar chart of the day-over-day increase in the cumulative COVID-19 count for
# one state, starting at the first date with a positive increase.
#
# state: value to match against the Province_State column.
# stat:  "cases" (default) or "deaths".
# Reads the globals US.cases / US.cases.data or US.deaths / US.deaths.data.
# Stops with an error if no row matches `state` or there is no positive
# increase. Returns whatever barplot() returns.
state.rise <- function(state, stat = c("cases", "deaths")){
  # Resolve the default (a length-2 vector) to a single valid choice.
  stat <- match.arg(stat)
  # drop = FALSE keeps a matrix even when only one row matches.
  if(stat == "cases"){
    data.thisState <- US.cases.data[which(US.cases$Province_State == state), , drop = FALSE]
  }else{
    data.thisState <- US.deaths.data[which(US.deaths$Province_State == state), , drop = FALSE]
  }
  if(nrow(data.thisState) == 0){
    stop("no rows found for state '", state, "'", call. = FALSE)
  }
  data.sum <- colSums(data.thisState)
  # Each value is one date's total minus the previous date's total, named
  # after the later date.
  rise.cases <- diff(data.sum)
  if(!any(rise.cases > 0, na.rm = TRUE)){
    stop("no rise in ", stat, " recorded for state '", state, "'", call. = FALSE)
  }
  day.first.case <- min(which(rise.cases > 0))
  n.days <- length(rise.cases)
  barplot(rise.cases[day.first.case:n.days], main = paste0("Rise in COVID-19 ", stat, " by Date in ", state), las = 2, cex.axis = 1, cex.names = 0.5)
}
# Log-scale bar chart of the cumulative COVID-19 count for one county,
# starting at the first date with a positive count.
#
# county: value to match against the Admin2 column.
# stat:   "cases" (default) or "deaths".
# state:  optional Province_State value used to pick one county when the same
#         county name occurs in several states; NULL (default) means no
#         state filter.
# Reads the globals US.cases / US.cases.data or US.deaths / US.deaths.data.
# Stops with an error if no row, or more than one row, matches, or if the
# county has no positive count. Returns whatever barplot() returns.
county.curve <- function(county, stat = c("cases", "deaths"), state = NULL){
  # Resolve the default (a length-2 vector) to a single valid choice.
  stat <- match.arg(stat)
  if(stat == "cases"){
    info <- US.cases
    counts <- US.cases.data
  }else{
    info <- US.deaths
    counts <- US.deaths.data
  }
  rows <- which(info$Admin2 == county)
  if(!is.null(state)){
    rows <- rows[info$Province_State[rows] %in% state]
  }
  if(length(rows) == 0){
    stop("no rows found for county '", county, "'", call. = FALSE)
  }
  # Several matching rows would give a matrix whose linear indexing below is
  # meaningless, so ask the caller to disambiguate instead of plotting it.
  if(length(rows) > 1){
    stop("county '", county, "' matches ", length(rows), " rows (",
         paste(unique(info$Province_State[rows]), collapse = ", "),
         "); pass `state` to select one", call. = FALSE)
  }
  data <- counts[rows, ]
  if(!any(data > 0, na.rm = TRUE)){
    stop("no ", stat, " recorded for county '", county, "'", call. = FALSE)
  }
  day.first.case <- min(which(data > 0))
  n.days <- length(data)
  barplot(data[day.first.case:n.days], main = paste0("Total COVID-19 ", stat," by date in ", county), log = "y", las = 2, cex.axis = 1, cex.names = 0.5)
}
# Example county plots.
county.curve("Tulsa", "cases")
county.curve("Tulsa", "deaths")

# One row of summary statistics per UID; columns are added further down.
US.stats <- data.frame(UID = US.cases$UID)

# National cumulative cases, plotted from the first date on which the total
# exceeds 100, on a linear and then a log scale.
cases.total <- colSums(US.cases.data)
day.first.case <- min(which(cases.total > 100))
n.days <- length(cases.total)
par(mar = c(5, 5, 4, 2))
cases.shown <- cases.total[day.first.case:n.days]
barplot(cases.shown, main = "Total COVID-19 cases by Date in US",
        las = 2, cex.axis = 1, cex.names = 0.5)
barplot(cases.shown, main = "Total COVID-19 cases by Date in US, log scale",
        las = 2, cex.axis = 1, cex.names = 0.5, log = "y")

# National cumulative deaths, plotted from the first date with any death, on a
# linear and then a log scale.
deaths.total <- colSums(US.deaths.data)
day.first.case <- min(which(deaths.total > 0))
n.days <- length(deaths.total)
deaths.shown <- deaths.total[day.first.case:n.days]
barplot(deaths.shown, main = "Total COVID-19 deaths by Date in US",
        las = 2, cex.axis = 1, cex.names = 0.5)
barplot(deaths.shown, main = "Total COVID-19 deaths by Date in US, log scale",
        las = 2, cex.axis = 1, cex.names = 0.5, log = "y")
avg.rise.cases
# Day-over-day increase in cumulative cases for every row: each date column
# minus the previous one, labelled with the later date. Done as a single
# matrix subtraction instead of a column-by-column loop.
last.col <- ncol(US.cases.data)
rise.cases <- US.cases.data[, -1, drop = FALSE] - US.cases.data[, -last.col, drop = FALSE]
# Keep only column names, as before (rows are matched to US.stats by position).
rownames(rise.cases) <- NULL
# Average daily increase per row.
US.stats$avg.rise.cases <- rowMeans(rise.cases)
# National daily increase, plotted from the first date with a positive value.
rise.cases.total <- colSums(rise.cases)
day.first.case <- min(which(rise.cases.total > 0))
n.days <- length(rise.cases.total)
barplot(rise.cases.total[day.first.case:n.days], main = "Rise in Cases of COVID-19 by Date in US", las = 2, cex.axis = 1, cex.names = 0.5)
avg.rise.deaths
# Day-over-day increase in cumulative deaths for every row: each date column
# minus the previous one, labelled with the later date. Done as a single
# matrix subtraction instead of a column-by-column loop.
last.col <- ncol(US.deaths.data)
rise.deaths <- US.deaths.data[, -1, drop = FALSE] - US.deaths.data[, -last.col, drop = FALSE]
# Keep only column names, as before (rows are matched to US.stats by position).
rownames(rise.deaths) <- NULL
# Average daily increase per row.
US.stats$avg.rise.deaths <- rowMeans(rise.deaths)
# National daily increase, plotted from the first date with a positive value.
rise.deaths.total <- colSums(rise.deaths)
day.first.case <- min(which(rise.deaths.total > 0))
n.days <- length(rise.deaths.total)
barplot(rise.deaths.total[day.first.case:n.days], main = "Rise in Deaths of COVID-19 by Date in US", las = 2, cex.axis = 1, cex.names = 0.5)
total.cases
# Cumulative cases per row as of the last date column.
US.stats[["total.cases"]] <- US.cases.data[, ndays.cases]
# Cases per person; rows with a population of zero get NA instead of a
# division-by-zero result.
US.stats[["total.cases.percap"]] <- US.stats$total.cases / US.cases$Population
US.stats[["total.cases.percap"]][US.cases$Population == 0] <- NA
hist(US.stats$total.cases.percap)
total.deaths
# Cumulative deaths per row as of the last date column of the deaths matrix.
US.stats[["total.deaths"]] <- US.deaths.data[, ndays.deaths]
total.deaths.percap
# Deaths per person; rows with a population of zero get NA instead of a
# division-by-zero result.
US.stats[["total.deaths.percap"]] <- US.stats$total.deaths / US.deaths$Population
US.stats[["total.deaths.percap"]][US.deaths$Population == 0] <- NA
# Largest per-capita death rate among rows with a known rate.
max(US.stats$total.deaths.percap, na.rm = TRUE)
## [1] 0.002823954
total.deaths.percase: The Johns Hopkins data contains errors; some rows have total.deaths > total.cases.
# Earlier variant, kept for reference: it set the ratio to 0 (rather than NA)
# for rows without any cases.
# pos.case.ind <- US.stats$total.cases > 0
# US.stats$total.deaths.percase[pos.case.ind] <- US.stats$total.deaths[pos.case.ind] / US.stats$total.cases[pos.case.ind]
# US.stats$total.deaths.percase[!pos.case.ind] <- 0

# Deaths per confirmed case; undefined (NA) where there are no cases.
US.stats[["total.deaths.percase"]] <- US.stats$total.deaths / US.stats$total.cases
US.stats[["total.deaths.percase"]][US.stats$total.cases == 0] <- NA
# Show the rows that report more deaths than cases.
US.stats[which(with(US.stats, total.deaths > total.cases)), ]
## UID avg.rise.cases avg.rise.deaths total.cases
## 3155 84080008 0.00000000 0.02173913 0
## 3203 84090002 0.00000000 0.04347826 0
## 3206 84090006 0.00000000 0.02173913 0
## 3222 84090024 0.00000000 1.15217391 0
## 3229 84090031 0.07608696 0.09782609 7
## 3230 84090032 0.04347826 0.05434783 4
## 3231 84090033 0.11956522 0.52173913 11
## 3252 84090056 0.00000000 0.06521739 0
## total.cases.percap total.deaths total.deaths.percap
## 3155 NA 2 NA
## 3203 NA 4 NA
## 3206 NA 2 NA
## 3222 NA 106 NA
## 3229 NA 9 NA
## 3230 NA 5 NA
## 3231 NA 48 NA
## 3252 NA 6 NA
## total.deaths.percase
## 3155 NA
## 3203 NA
## 3206 NA
## 3222 NA
## 3229 1.285714
## 3230 1.250000
## 3231 4.363636
## 3252 NA
# Build the merge key: zero-pad UID to 8 characters and keep characters 4-8.
# NOTE(review): presumably this is the 5-digit county FIPS code used as `ID`
# in the covariate tables -- confirm.
US.stats$ID <- substr(str_pad(US.stats$UID, 8, "left", pad = "0"), 4, 8)

# COVID statistics joined with the county factors. The first two columns of
# the merged table (ID and UID) are identifiers and are left out of the
# correlations.
data.merge <- merge(US.stats, county_factors, by = "ID")
data.cor <- cor(data.merge[, -(1:2)], use = "complete.obs", method = "spearman")
corrplot.mixed(data.cor, upper = "ellipse", lower = "number", tl.pos = "lt",
               tl.cex = 1, lower.col = "black", number.cex = 0.5)

# Same again after also joining the 500 Cities data (only IDs present in both).
data.merge2 <- merge(data.merge, county_500CitiesData, by = "ID", all.x = FALSE)
data.cor2 <- cor(data.merge2[, -(1:2)], use = "complete.obs", method = "spearman")
corrplot.mixed(data.cor2, upper = "ellipse", lower = "number", tl.pos = "lt",
               tl.cex = 1, lower.col = "black", number.cex = 0.5)
# Rows 1-7 are the seven COVID statistics computed above; columns 8-42 are the
# remaining variables (NOTE(review): assumes the covariates fill exactly
# columns 8-42 -- confirm).
corrplot.mixed(data.cor2[1:7, 8:42], upper = "ellipse", lower = "number",
               tl.pos = "lt", tl.cex = 1, lower.col = "black", number.cex = 0.5)
# Keep the rows of the latest daily report that have a FIPS code of at most
# two characters, and zero-pad that code to two characters so it can be
# matched against the first two characters of the county ID.
US.todaysReport.states <- US.todaysReport[!is.na(US.todaysReport$FIPS) & nchar(US.todaysReport$FIPS) <= 2, ]
US.todaysReport.states$FIPS <- str_pad(US.todaysReport.states$FIPS, 2, "left", pad = "0")
data.merge2$stateID <- substr(data.merge2$ID, 1, 2)
data.merge3 <- merge(data.merge2, US.todaysReport.states, by.x = "stateID", by.y = "FIPS")
# Linear mixed model of per-capita cases on the county covariates plus the
# state-level testing and hospitalization rates, with a random intercept per
# state. The formula is given as a formula object (not a character string),
# which is what lmer() documents for this argument.
this.lme <- lmer(
  total.cases.percap ~ Affluence + Singletons.in.Tract + Seniors.in.Tract +
    African.Americans.in.Tract + Noncitizens.in.Tract + High.BP +
    Binge.Drinking + Cancer + Asthma + Heart.Disease + COPD + Smoking +
    Diabetes + No.Physical.Activity + Obesity + Poor.Sleeping.Habits +
    Poor.Mental.Health + Testing_Rate + Hospitalization_Rate +
    (1 | stateID),
  data = data.merge3)
## Warning: Some predictor variables are on very different scales: consider
## rescaling
## Warning: Some predictor variables are on very different scales: consider
## rescaling
print(summary(this.lme), correlation = TRUE)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula:
## "total.cases.percap ~ Affluence + Singletons.in.Tract + Seniors.in.Tract + African.Americans.in.Tract + Noncitizens.in.Tract + High.BP + Binge.Drinking + Cancer + Asthma + Heart.Disease + COPD + Smoking + Diabetes + No.Physical.Activity + Obesity + Poor.Sleeping.Habits + Poor.Mental.Health + Testing_Rate + Hospitalization_Rate + (1 | stateID)"
## Data: data.merge3
##
## REML criterion at convergence: -2506.2
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.1929 -0.3579 -0.0560 0.2347 7.5430
##
## Random effects:
## Groups Name Variance Std.Dev.
## stateID (Intercept) 0.000002448 0.001564
## Residual 0.000006395 0.002529
## Number of obs: 313, groups: stateID, 48
##
## Fixed effects:
## Estimate Std. Error df
## (Intercept) -0.0076046210 0.0054983221 160.0186319102
## Affluence 0.0018842157 0.0005115882 276.5458521199
## Singletons.in.Tract 0.0011212158 0.0004897653 292.1050248934
## Seniors.in.Tract 0.0004748523 0.0006147572 292.0036323649
## African.Americans.in.Tract 0.0008946579 0.0005927230 292.9632731416
## Noncitizens.in.Tract 0.0006489838 0.0004615817 232.4249776226
## High.BP -0.0000086436 0.0001058599 259.1487011978
## Binge.Drinking 0.0001883236 0.0001062674 120.6867000722
## Cancer -0.0005340962 0.0006058776 223.9258066577
## Asthma -0.0001235823 0.0003639803 132.5915055333
## Heart.Disease 0.0017636366 0.0007833436 169.0580159400
## COPD -0.0004212864 0.0006188766 171.9667920206
## Smoking -0.0002118272 0.0001359251 189.8062264573
## Diabetes -0.0004722073 0.0003035396 242.3273021081
## No.Physical.Activity 0.0000587176 0.0001172461 196.0727938943
## Obesity 0.0001243364 0.0000974894 288.8872464222
## Poor.Sleeping.Habits 0.0002097653 0.0000938972 274.0698870330
## Poor.Mental.Health -0.0000151465 0.0002957100 81.1426751871
## Testing_Rate 0.0000007887 0.0000004344 44.6141502015
## Hospitalization_Rate -0.0000770806 0.0000511017 34.7373160812
## t value Pr(>|t|)
## (Intercept) -1.383 0.168567
## Affluence 3.683 0.000277 ***
## Singletons.in.Tract 2.289 0.022776 *
## Seniors.in.Tract 0.772 0.440489
## African.Americans.in.Tract 1.509 0.132274
## Noncitizens.in.Tract 1.406 0.161060
## High.BP -0.082 0.934987
## Binge.Drinking 1.772 0.078890 .
## Cancer -0.882 0.378979
## Asthma -0.340 0.734748
## Heart.Disease 2.251 0.025647 *
## COPD -0.681 0.496959
## Smoking -1.558 0.120802
## Diabetes -1.556 0.121092
## No.Physical.Activity 0.501 0.617069
## Obesity 1.275 0.203198
## Poor.Sleeping.Habits 2.234 0.026291 *
## Poor.Mental.Health -0.051 0.959275
## Testing_Rate 1.816 0.076165 .
## Hospitalization_Rate -1.508 0.140500
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of fixed effects could have been required in summary()
##
## Correlation of Fixed Effects:
## (Intr) Afflnc Sng..T Snr..T A.A..T Nnc..T Hgh.BP Bng.Dr Cancer
## Affluence -0.049
## Sngltns.n.T -0.070 0.031
## Snrs.n.Trct 0.413 0.297 0.056
## Afrcn.Am..T 0.246 0.078 -0.419 0.213
## Nnctzns.n.T -0.081 0.151 0.131 0.047 -0.183
## High.BP -0.117 0.170 0.104 0.018 -0.262 0.347
## Bing.Drnkng -0.422 -0.070 -0.222 -0.080 0.047 -0.070 0.162
## Cancer -0.488 -0.096 0.234 -0.186 -0.051 -0.090 -0.328 -0.055
## Asthma -0.277 -0.055 -0.251 -0.081 0.011 0.200 0.105 -0.001 -0.154
## Heart.Dises -0.104 0.097 -0.286 -0.121 0.195 -0.029 -0.003 0.036 -0.569
## COPD 0.509 -0.029 0.109 0.150 0.002 0.143 0.044 0.086 -0.225
## Smoking -0.064 0.089 -0.116 -0.142 -0.113 0.155 -0.100 -0.328 0.168
## Diabetes 0.086 -0.316 -0.102 -0.147 -0.206 -0.278 -0.439 0.081 0.318
## N.Physcl.Ac -0.128 0.049 0.102 0.094 0.069 -0.267 0.020 0.096 0.352
## Obesity -0.061 0.381 0.380 0.202 0.155 0.187 -0.119 -0.180 0.138
## Pr.Slpng.Hb -0.404 -0.341 0.185 -0.344 -0.347 -0.006 -0.136 0.082 0.033
## Pr.Mntl.Hlt -0.373 0.200 0.005 0.014 0.040 -0.176 0.027 0.110 0.420
## Testing_Rat 0.147 -0.149 0.014 -0.042 0.007 -0.068 -0.102 -0.054 0.000
## Hsptlztn_Rt -0.103 0.025 -0.043 -0.052 -0.008 0.003 -0.018 -0.080 -0.051
## Asthma Hrt.Ds COPD Smokng Diabts N.Ph.A Obesty Pr.S.H Pr.M.H
## Affluence
## Sngltns.n.T
## Snrs.n.Trct
## Afrcn.Am..T
## Nnctzns.n.T
## High.BP
## Bing.Drnkng
## Cancer
## Asthma
## Heart.Dises 0.383
## COPD -0.393 -0.510
## Smoking 0.123 0.062 -0.439
## Diabetes -0.166 -0.430 0.041 0.303
## N.Physcl.Ac 0.009 -0.321 0.026 -0.284 -0.217
## Obesity -0.152 -0.033 0.088 -0.217 -0.361 -0.044
## Pr.Slpng.Hb 0.033 0.262 -0.117 -0.138 -0.085 -0.146 -0.123
## Pr.Mntl.Hlt -0.352 -0.039 -0.415 -0.027 0.028 -0.022 0.038 -0.096
## Testing_Rat -0.323 -0.211 0.259 0.068 0.229 -0.139 0.063 -0.140 -0.096
## Hsptlztn_Rt -0.073 0.105 -0.065 0.054 -0.056 -0.013 0.032 -0.011 0.033
## Tstn_R
## Affluence
## Sngltns.n.T
## Snrs.n.Trct
## Afrcn.Am..T
## Nnctzns.n.T
## High.BP
## Bing.Drnkng
## Cancer
## Asthma
## Heart.Dises
## COPD
## Smoking
## Diabetes
## N.Physcl.Ac
## Obesity
## Pr.Slpng.Hb
## Pr.Mntl.Hlt
## Testing_Rat
## Hsptlztn_Rt 0.233
## fit warnings:
## Some predictor variables are on very different scales: consider rescaling
# Store the summary of the fitted mixed model in `this.lme.sum`.
this.lme.sum <- summary(object = this.lme)